
// Copy `size` bytes from `src` to `dest` with memcpy and hand the byte count
// back, so a caller can advance a buffer cursor by the return value (this is
// how the pack_field / unpack_field macros use it).
//   dest, src : destination / source of the raw byte copy
//   size      : number of bytes to copy
//   returns   : `size`, unchanged
template <typename T0, typename T1>
INLINE size_t memtran(T0 *dest, T1 *src, size_t size) {
  const size_t nbytes = size;
  memcpy(dest, src, nbytes);
  return nbytes;
}

// Prototypes for the alternative pack/unpack routines that are used when
// __sw__ is defined. Their definitions are not part of this section.
#ifdef __sw__
#include "sw/swarch.h"
// Same signature as pack_brick_forward_most / unpack_brick_forward_most;
// forward_comm_most redirects to these via #define when __sw__ is set.
size_t pack_brick_most_sw(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi);
size_t unpack_brick_most_sw(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi);
// Variants that take a vec_pack_param_t* instead of explicit index ranges;
// callers pass an entry of sw_archdata_t::pack_params (e.g. PACK_FWD_NEG_Z).
size_t pack_brick_pre_sw_forward_x(char *buf, cellgrid_t *grid, vec_pack_param_t *);
size_t unpack_brick_pre_sw_forward_x(char *buf, cellgrid_t *grid, vec_pack_param_t *);
size_t pack_brick_pre_sw_reverse_f(char *buf, cellgrid_t *grid, vec_pack_param_t *);
size_t unpack_brick_pre_sw_reverse_f(char *buf, cellgrid_t *grid, vec_pack_param_t *);
size_t pack_brick_pre_sw_forward_shake(char *buf, cellgrid_t *grid, vec_pack_param_t *);
size_t unpack_brick_pre_sw_forward_shake(char *buf, cellgrid_t *grid, vec_pack_param_t *);
// Same signature as pack_brick_forward_export_list / unpack_brick_forward_export_list;
// forward_comm_export_list redirects to these via #define when __sw__ is set.
size_t pack_brick_export_sw(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi);
size_t unpack_brick_export_sw(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi);
#endif
// pack_field: copy n elements of `field` to the cursor `ptr` and advance
// `ptr` by the number of bytes copied (sizeof(*field) * n).
#define pack_field(ptr, field, n) (ptr) += memtran((ptr), (field), sizeof(*(field)) * (n))
// unpack_field: copy n elements from the cursor `ptr` into `field` and
// advance `ptr` by the number of bytes copied.
#define unpack_field(ptr, field, n) (ptr) += memtran((field), (ptr), sizeof(*(field)) * (n))
// COMM_ISEND: start a non-blocking send of `size` bytes from mpp->send_<dir>
// to rank mpp-><dir>.<axis>, tagged stag_<dir>; the request is stored in
// mpp->send_req_<dir>. `dir` and `axis` are pasted as tokens (prev/next, x/y/z).
#define COMM_ISEND(mpp, size, dir, axis) \
  MPI_Isend((mpp)->send_##dir, (size), MPI_BYTE, (mpp)->dir.axis, stag_##dir, (mpp)->comm, &(mpp)->send_req_##dir)
// COMM_IRECV: post a non-blocking receive of up to mpp->max_comm_size bytes
// into mpp->recv_<dir> from rank mpp-><dir>.<axis>, tagged rtag_<dir>; the
// request is stored in mpp->recv_req_<dir>.
#define COMM_IRECV(mpp, dir, axis) \
  MPI_Irecv((mpp)->recv_##dir, (mpp)->max_comm_size, MPI_BYTE, (mpp)->dir.axis, rtag_##dir, (mpp)->comm, &(mpp)->recv_req_##dir)
// COMM_WAITALL: wait for the four outstanding requests (send prev/next,
// receive prev/next), one MPI_Wait each. NOTE(review): expands to a bare
// { ... } block, so `COMM_WAITALL(m);` inside an unbraced if/else would not
// parse as a single statement.
#define COMM_WAITALL(mpp)                                        \
  {                                                              \
    MPI_Wait(&((mpp)->send_req_prev), &((mpp)->send_stat_prev)); \
    MPI_Wait(&((mpp)->send_req_next), &((mpp)->send_stat_next)); \
    MPI_Wait(&((mpp)->recv_req_prev), &((mpp)->recv_stat_prev)); \
    MPI_Wait(&((mpp)->recv_req_next), &((mpp)->recv_stat_next)); \
  }


// Serialise one cell into `buf`: an 8-byte header slot holding cell->natom
// (stored as a long), followed by natom elements each of tag, x, q, t, mass
// and rmass, in that order.
//   buf     : destination cursor inside the send buffer
//   cell    : cell whose first natom atoms are packed
//   returns : number of bytes written, header slot included
size_t pack_cell_forward_most(char *buf, celldata_t *cell){
  // Cells are packed back to back with variable-length payloads, so `buf` is
  // not guaranteed to be aligned for `long`. Copy the header bytewise instead
  // of storing through a (long*) cast; the bytes written are the same.
  const long natom_header = cell->natom;
  memcpy(buf, &natom_header, sizeof(natom_header));
  // The header slot is always 8 bytes wide, matching the unpack side.
  char *ptr = buf + 8;
  pack_field(ptr, cell->tag, cell->natom);
  pack_field(ptr, cell->x, cell->natom);
  pack_field(ptr, cell->q, cell->natom);
  pack_field(ptr, cell->t, cell->natom);
  pack_field(ptr, cell->mass, cell->natom);
  pack_field(ptr, cell->rmass, cell->natom);
  return ptr - buf;
}
//['tag', 'x', 'q', 't', 'mass', 'rmass']
// Inverse of pack_cell_forward_most: read the atom count from the 8-byte
// header slot into cell->natom, then copy natom elements each of tag, x, q,
// t, mass and rmass out of `buf` into the cell's arrays.
//   buf     : source cursor inside the receive buffer
//   cell    : cell that receives the count and the per-atom data
//   returns : number of bytes consumed, header slot included
// NOTE(review): the received count is used as-is; nothing here checks it
// against the capacity of the cell's arrays.
size_t unpack_cell_forward_most(char *buf, celldata_t *cell){
  // `buf` may sit at any byte offset (preceding cells have variable-length
  // payloads), so load the header bytewise rather than through a (long*) cast.
  long natom_header;
  memcpy(&natom_header, buf, sizeof(natom_header));
  cell->natom = natom_header;
  // The header slot is always 8 bytes wide, matching the pack side.
  char *ptr = buf + 8;
  unpack_field(ptr, cell->tag, cell->natom);
  unpack_field(ptr, cell->x, cell->natom);
  unpack_field(ptr, cell->q, cell->natom);
  unpack_field(ptr, cell->t, cell->natom);
  unpack_field(ptr, cell->mass, cell->natom);
  unpack_field(ptr, cell->rmass, cell->natom);
  return ptr - buf;
}

size_t pack_brick_forward_most(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += pack_cell_forward_most(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

size_t unpack_brick_forward_most(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += unpack_cell_forward_most(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

// Exchange the "most" payload with both neighbours along z, then y, then x.
//
// Each axis phase does: post both receives, pack the two outgoing slabs into
// mpp->send_prev / mpp->send_next, start both sends, wait for all four
// requests, then unpack mpp->recv_prev / mpp->recv_next into the grid.
// Outgoing slabs are `grid->nn` cells thick, taken from [0, nn) and
// [nlocal - nn, nlocal) on the active axis; incoming data lands in
// [lo, lo + nn) and [hi - nn, hi). Axes already exchanged are packed over
// their full [lo, hi) range.
//
// When __sw__ is defined the brick routines are redirected to
// pack_brick_most_sw / unpack_brick_most_sw. The #defines are never
// #undef'ed, so they stay in effect for the rest of the translation unit.
void forward_comm_most(cellgrid_t *grid, mpp_t *mpp) {
  const vec<int> &lower = grid->dim.lo;
  const vec<int> &upper = grid->dim.hi;
  const vec<int> &extent = grid->nlocal;
  const int width = grid->nn;

  #ifdef __sw__
  #define pack_brick_forward_most pack_brick_most_sw
  #define unpack_brick_forward_most unpack_brick_most_sw
  #endif

  size_t bytes_to_prev = 0;
  size_t bytes_to_next = 0;

  // ---- z axis ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  bytes_to_prev = pack_brick_forward_most(mpp->send_prev, grid, 0, extent.x, 0, extent.y, 0, width);
  bytes_to_next = pack_brick_forward_most(mpp->send_next, grid, 0, extent.x, 0, extent.y, extent.z - width, extent.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, z);
  COMM_ISEND(mpp, bytes_to_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_forward_most(mpp->recv_prev, grid, 0, extent.x, 0, extent.y, lower.z, lower.z + width);
  unpack_brick_forward_most(mpp->recv_next, grid, 0, extent.x, 0, extent.y, upper.z - width, upper.z);

  // ---- y axis (full z range, including what the z phase received) ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  bytes_to_prev = pack_brick_forward_most(mpp->send_prev, grid, 0, extent.x, 0, width, lower.z, upper.z);
  bytes_to_next = pack_brick_forward_most(mpp->send_next, grid, 0, extent.x, extent.y - width, extent.y, lower.z, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, y);
  COMM_ISEND(mpp, bytes_to_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_forward_most(mpp->recv_prev, grid, 0, extent.x, lower.y, lower.y + width, lower.z, upper.z);
  unpack_brick_forward_most(mpp->recv_next, grid, 0, extent.x, upper.y - width, upper.y, lower.z, upper.z);

  // ---- x axis (full y and z ranges) ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  bytes_to_prev = pack_brick_forward_most(mpp->send_prev, grid, 0, width, lower.y, upper.y, lower.z, upper.z);
  bytes_to_next = pack_brick_forward_most(mpp->send_next, grid, extent.x - width, extent.x, lower.y, upper.y, lower.z, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, x);
  COMM_ISEND(mpp, bytes_to_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_forward_most(mpp->recv_prev, grid, lower.x, lower.x + width, lower.y, upper.y, lower.z, upper.z);
  unpack_brick_forward_most(mpp->recv_next, grid, upper.x - width, upper.x, lower.y, upper.y, lower.z, upper.z);
}

// Copy the first cell->natom elements of cell->x into `buf`.
// No atom count is written; the receiver uses its own cell->natom.
// Returns the number of bytes written.
size_t pack_cell_forward_x(char *buf, celldata_t *cell) {
  char *cursor = buf;
  pack_field(cursor, cell->x, cell->natom);
  return static_cast<size_t>(cursor - buf);
}
//['x']
// Copy cell->natom elements from `buf` into cell->x. The count comes from the
// cell itself, not from the buffer. Returns the number of bytes consumed.
size_t unpack_cell_forward_x(char *buf, celldata_t *cell) {
  char *cursor = buf;
  unpack_field(cursor, cell->x, cell->natom);
  return static_cast<size_t>(cursor - buf);
}

size_t pack_brick_forward_x(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += pack_cell_forward_x(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

size_t unpack_brick_forward_x(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += unpack_cell_forward_x(ptr, cell);
      }
    }
  }
  return ptr - buf;
}
// Timers used by forward_comm_x: PACKX brackets the pack calls and UNPACKX
// the unpack calls (via timer_start / timer_stop).
#include "timer.h"
DEF_TIMER(PACKX, "comm/packx")
DEF_TIMER(UNPACKX, "comm/unpackx")
// #undef SW5
// Needed for sw_archdata_t and the PACK_* / UNPACK_* indices used below
// (presumably declared in sw/swarch.h; the header is not visible here).
#ifdef __sw__
#include "sw/swarch.h"
#endif
// Exchange cell->x with both neighbours along z, then y, then x.
//
// Each axis phase does: post both receives, pack the two outgoing slabs
// (timed under PACKX), start both sends, wait for all four requests, then
// unpack the two received buffers (timed under UNPACKX).
//
// Without __sw__ the slabs are given as explicit index ranges: outgoing data
// comes from [0, nn) and [nlocal - nn, nlocal) on the active axis, incoming
// data lands in [lo, lo + nn) and [hi - nn, hi), and axes already exchanged
// are covered over their full [lo, hi) range. With __sw__ the slabs are
// described by entries of archdata->pack_params instead.
void forward_comm_x(cellgrid_t *grid, mpp_t *mpp) {
  const vec<int> &lower = grid->dim.lo;
  const vec<int> &upper = grid->dim.hi;
  const vec<int> &extent = grid->nlocal;
  const int width = grid->nn;
  #ifdef __sw__
  sw_archdata_t *const arch = (sw_archdata_t*)grid->arch_data;
  #endif
  size_t bytes_to_prev = 0;
  size_t bytes_to_next = 0;

  // ---- z axis ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  timer_start(PACKX);
  #ifdef __sw__
  bytes_to_prev = pack_brick_pre_sw_forward_x(mpp->send_prev, grid, arch->pack_params + PACK_FWD_NEG_Z);
  bytes_to_next = pack_brick_pre_sw_forward_x(mpp->send_next, grid, arch->pack_params + PACK_FWD_POS_Z);
  #else
  bytes_to_prev = pack_brick_forward_x(mpp->send_prev, grid, 0, extent.x, 0, extent.y, 0, width);
  bytes_to_next = pack_brick_forward_x(mpp->send_next, grid, 0, extent.x, 0, extent.y, extent.z - width, extent.z);
  #endif
  timer_stop(PACKX);
  COMM_ISEND(mpp, bytes_to_prev, prev, z);
  COMM_ISEND(mpp, bytes_to_next, next, z);
  COMM_WAITALL(mpp);
  timer_start(UNPACKX);
  #ifdef __sw__
  unpack_brick_pre_sw_forward_x(mpp->recv_prev, grid, arch->pack_params + UNPACK_FWD_NEG_Z);
  unpack_brick_pre_sw_forward_x(mpp->recv_next, grid, arch->pack_params + UNPACK_FWD_POS_Z);
  #else
  unpack_brick_forward_x(mpp->recv_prev, grid, 0, extent.x, 0, extent.y, lower.z, lower.z + width);
  unpack_brick_forward_x(mpp->recv_next, grid, 0, extent.x, 0, extent.y, upper.z - width, upper.z);
  #endif
  timer_stop(UNPACKX);

  // ---- y axis ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  timer_start(PACKX);
  #ifdef __sw__
  bytes_to_prev = pack_brick_pre_sw_forward_x(mpp->send_prev, grid, arch->pack_params + PACK_FWD_NEG_Y);
  bytes_to_next = pack_brick_pre_sw_forward_x(mpp->send_next, grid, arch->pack_params + PACK_FWD_POS_Y);
  #else
  bytes_to_prev = pack_brick_forward_x(mpp->send_prev, grid, 0, extent.x, 0, width, lower.z, upper.z);
  bytes_to_next = pack_brick_forward_x(mpp->send_next, grid, 0, extent.x, extent.y - width, extent.y, lower.z, upper.z);
  #endif
  timer_stop(PACKX);
  COMM_ISEND(mpp, bytes_to_prev, prev, y);
  COMM_ISEND(mpp, bytes_to_next, next, y);
  COMM_WAITALL(mpp);
  timer_start(UNPACKX);
  #ifdef __sw__
  unpack_brick_pre_sw_forward_x(mpp->recv_prev, grid, arch->pack_params + UNPACK_FWD_NEG_Y);
  unpack_brick_pre_sw_forward_x(mpp->recv_next, grid, arch->pack_params + UNPACK_FWD_POS_Y);
  #else
  unpack_brick_forward_x(mpp->recv_prev, grid, 0, extent.x, lower.y, lower.y + width, lower.z, upper.z);
  unpack_brick_forward_x(mpp->recv_next, grid, 0, extent.x, upper.y - width, upper.y, lower.z, upper.z);
  #endif
  timer_stop(UNPACKX);

  // ---- x axis ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  timer_start(PACKX);
  #ifdef __sw__
  bytes_to_prev = pack_brick_pre_sw_forward_x(mpp->send_prev, grid, arch->pack_params + PACK_FWD_NEG_X);
  bytes_to_next = pack_brick_pre_sw_forward_x(mpp->send_next, grid, arch->pack_params + PACK_FWD_POS_X);
  #else
  bytes_to_prev = pack_brick_forward_x(mpp->send_prev, grid, 0, width, lower.y, upper.y, lower.z, upper.z);
  bytes_to_next = pack_brick_forward_x(mpp->send_next, grid, extent.x - width, extent.x, lower.y, upper.y, lower.z, upper.z);
  #endif
  timer_stop(PACKX);
  COMM_ISEND(mpp, bytes_to_prev, prev, x);
  COMM_ISEND(mpp, bytes_to_next, next, x);
  COMM_WAITALL(mpp);
  timer_start(UNPACKX);
  #ifdef __sw__
  unpack_brick_pre_sw_forward_x(mpp->recv_prev, grid, arch->pack_params + UNPACK_FWD_NEG_X);
  unpack_brick_pre_sw_forward_x(mpp->recv_next, grid, arch->pack_params + UNPACK_FWD_POS_X);
  #else
  unpack_brick_forward_x(mpp->recv_prev, grid, lower.x, lower.x + width, lower.y, upper.y, lower.z, upper.z);
  unpack_brick_forward_x(mpp->recv_next, grid, upper.x - width, upper.x, lower.y, upper.y, lower.z, upper.z);
  #endif
  timer_stop(UNPACKX);
}

// Copy the first cell->natom elements of cell->f into `buf`.
// Returns the number of bytes written.
size_t pack_cell_reverse_f(char *buf, celldata_t *cell) {
  char *cursor = buf;
  pack_field(cursor, cell->f, cell->natom);
  return static_cast<size_t>(cursor - buf);
}
//['f']
size_t unpack_cell_reverse_f(char *buf, celldata_t *cell){
  char *ptr = buf;
  vec<real> *f_buf = (vec<real>*)ptr;
  for (int i = 0; i < cell->natom; i ++){
    vecaddv(cell->f[i], cell->f[i], f_buf[i]);
  }
  ptr += sizeof(*(cell->f)) * cell->natom;
  return ptr - buf;
}

size_t pack_brick_reverse_f(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += pack_cell_reverse_f(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

size_t unpack_brick_reverse_f(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += unpack_cell_reverse_f(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

// Send cell->f back to both neighbours along x, then y, then z (the opposite
// axis order to the forward_comm_* routines) and combine what arrives with
// the local values via unpack_*_reverse_f.
//
// Each axis phase does: post both receives, pack the two outgoing slabs,
// start both sends, wait for all four requests, then unpack.
//
// Without __sw__ the slabs are explicit index ranges: outgoing data comes
// from [lo, lo + nn) and [hi - nn, hi) on the active axis, and incoming data
// is applied to [0, nn) and [nlocal - nn, nlocal). Axes not yet processed are
// covered over their full [lo, hi) range, axes already processed over
// [0, nlocal). With __sw__ the slabs are described by entries of
// archdata->pack_params instead.
void reverse_comm_f(cellgrid_t *grid, mpp_t *mpp) {
  const vec<int> &lower = grid->dim.lo;
  const vec<int> &upper = grid->dim.hi;
  const vec<int> &extent = grid->nlocal;
  const int width = grid->nn;
  #ifdef __sw__
  sw_archdata_t *const arch = (sw_archdata_t*)grid->arch_data;
  #endif
  size_t bytes_to_prev = 0;
  size_t bytes_to_next = 0;

  // ---- x axis ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  #ifdef __sw__
  bytes_to_prev = pack_brick_pre_sw_reverse_f(mpp->send_prev, grid, arch->pack_params + PACK_REV_NEG_X);
  bytes_to_next = pack_brick_pre_sw_reverse_f(mpp->send_next, grid, arch->pack_params + PACK_REV_POS_X);
  #else
  bytes_to_prev = pack_brick_reverse_f(mpp->send_prev, grid, lower.x, lower.x + width, lower.y, upper.y, lower.z, upper.z);
  bytes_to_next = pack_brick_reverse_f(mpp->send_next, grid, upper.x - width, upper.x, lower.y, upper.y, lower.z, upper.z);
  #endif
  COMM_ISEND(mpp, bytes_to_prev, prev, x);
  COMM_ISEND(mpp, bytes_to_next, next, x);
  COMM_WAITALL(mpp);
  #ifdef __sw__
  unpack_brick_pre_sw_reverse_f(mpp->recv_prev, grid, arch->pack_params + UNPACK_REV_NEG_X);
  unpack_brick_pre_sw_reverse_f(mpp->recv_next, grid, arch->pack_params + UNPACK_REV_POS_X);
  #else
  unpack_brick_reverse_f(mpp->recv_prev, grid, 0, width, lower.y, upper.y, lower.z, upper.z);
  unpack_brick_reverse_f(mpp->recv_next, grid, extent.x - width, extent.x, lower.y, upper.y, lower.z, upper.z);
  #endif

  // ---- y axis ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  #ifdef __sw__
  bytes_to_prev = pack_brick_pre_sw_reverse_f(mpp->send_prev, grid, arch->pack_params + PACK_REV_NEG_Y);
  bytes_to_next = pack_brick_pre_sw_reverse_f(mpp->send_next, grid, arch->pack_params + PACK_REV_POS_Y);
  #else
  bytes_to_prev = pack_brick_reverse_f(mpp->send_prev, grid, 0, extent.x, lower.y, lower.y + width, lower.z, upper.z);
  bytes_to_next = pack_brick_reverse_f(mpp->send_next, grid, 0, extent.x, upper.y - width, upper.y, lower.z, upper.z);
  #endif
  COMM_ISEND(mpp, bytes_to_prev, prev, y);
  COMM_ISEND(mpp, bytes_to_next, next, y);
  COMM_WAITALL(mpp);
  #ifdef __sw__
  unpack_brick_pre_sw_reverse_f(mpp->recv_prev, grid, arch->pack_params + UNPACK_REV_NEG_Y);
  unpack_brick_pre_sw_reverse_f(mpp->recv_next, grid, arch->pack_params + UNPACK_REV_POS_Y);
  #else
  unpack_brick_reverse_f(mpp->recv_prev, grid, 0, extent.x, 0, width, lower.z, upper.z);
  unpack_brick_reverse_f(mpp->recv_next, grid, 0, extent.x, extent.y - width, extent.y, lower.z, upper.z);
  #endif

  // ---- z axis ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  #ifdef __sw__
  bytes_to_prev = pack_brick_pre_sw_reverse_f(mpp->send_prev, grid, arch->pack_params + PACK_REV_NEG_Z);
  bytes_to_next = pack_brick_pre_sw_reverse_f(mpp->send_next, grid, arch->pack_params + PACK_REV_POS_Z);
  #else
  bytes_to_prev = pack_brick_reverse_f(mpp->send_prev, grid, 0, extent.x, 0, extent.y, lower.z, lower.z + width);
  bytes_to_next = pack_brick_reverse_f(mpp->send_next, grid, 0, extent.x, 0, extent.y, upper.z - width, upper.z);
  #endif
  COMM_ISEND(mpp, bytes_to_prev, prev, z);
  COMM_ISEND(mpp, bytes_to_next, next, z);
  COMM_WAITALL(mpp);
  #ifdef __sw__
  unpack_brick_pre_sw_reverse_f(mpp->recv_prev, grid, arch->pack_params + UNPACK_REV_NEG_Z);
  unpack_brick_pre_sw_reverse_f(mpp->recv_next, grid, arch->pack_params + UNPACK_REV_POS_Z);
  #else
  unpack_brick_reverse_f(mpp->recv_prev, grid, 0, extent.x, 0, extent.y, 0, width);
  unpack_brick_reverse_f(mpp->recv_next, grid, 0, extent.x, 0, extent.y, extent.z - width, extent.z);
  #endif
}

// Copy the first cell->natom elements of cell->v into `buf`.
// Returns the number of bytes written.
size_t pack_cell_forward_v(char *buf, celldata_t *cell) {
  char *cursor = buf;
  pack_field(cursor, cell->v, cell->natom);
  return static_cast<size_t>(cursor - buf);
}
//['v']
// Copy cell->natom elements from `buf` into cell->v (overwriting).
// Returns the number of bytes consumed.
size_t unpack_cell_forward_v(char *buf, celldata_t *cell) {
  char *cursor = buf;
  unpack_field(cursor, cell->v, cell->natom);
  return static_cast<size_t>(cursor - buf);
}

size_t pack_brick_forward_v(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += pack_cell_forward_v(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

size_t unpack_brick_forward_v(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += unpack_cell_forward_v(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

// Exchange cell->v with both neighbours along z, then y, then x.
//
// Each axis phase does: post both receives, pack the two outgoing slabs,
// start both sends, wait for all four requests, then unpack. Outgoing slabs
// are grid->nn cells thick, taken from [0, nn) and [nlocal - nn, nlocal) on
// the active axis; incoming data lands in [lo, lo + nn) and [hi - nn, hi).
// Axes already exchanged are covered over their full [lo, hi) range.
void forward_comm_v(cellgrid_t *grid, mpp_t *mpp) {
  const vec<int> &lower = grid->dim.lo;
  const vec<int> &upper = grid->dim.hi;
  const vec<int> &extent = grid->nlocal;
  const int width = grid->nn;

  size_t bytes_to_prev = 0;
  size_t bytes_to_next = 0;

  // ---- z axis ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  bytes_to_prev = pack_brick_forward_v(mpp->send_prev, grid, 0, extent.x, 0, extent.y, 0, width);
  bytes_to_next = pack_brick_forward_v(mpp->send_next, grid, 0, extent.x, 0, extent.y, extent.z - width, extent.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, z);
  COMM_ISEND(mpp, bytes_to_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_forward_v(mpp->recv_prev, grid, 0, extent.x, 0, extent.y, lower.z, lower.z + width);
  unpack_brick_forward_v(mpp->recv_next, grid, 0, extent.x, 0, extent.y, upper.z - width, upper.z);

  // ---- y axis ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  bytes_to_prev = pack_brick_forward_v(mpp->send_prev, grid, 0, extent.x, 0, width, lower.z, upper.z);
  bytes_to_next = pack_brick_forward_v(mpp->send_next, grid, 0, extent.x, extent.y - width, extent.y, lower.z, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, y);
  COMM_ISEND(mpp, bytes_to_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_forward_v(mpp->recv_prev, grid, 0, extent.x, lower.y, lower.y + width, lower.z, upper.z);
  unpack_brick_forward_v(mpp->recv_next, grid, 0, extent.x, upper.y - width, upper.y, lower.z, upper.z);

  // ---- x axis ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  bytes_to_prev = pack_brick_forward_v(mpp->send_prev, grid, 0, width, lower.y, upper.y, lower.z, upper.z);
  bytes_to_next = pack_brick_forward_v(mpp->send_next, grid, extent.x - width, extent.x, lower.y, upper.y, lower.z, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, x);
  COMM_ISEND(mpp, bytes_to_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_forward_v(mpp->recv_prev, grid, lower.x, lower.x + width, lower.y, upper.y, lower.z, upper.z);
  unpack_brick_forward_v(mpp->recv_next, grid, upper.x - width, upper.x, lower.y, upper.y, lower.z, upper.z);
}

// Copy the first cell->natom elements of cell->v into `buf` for the reverse
// exchange. Returns the number of bytes written.
size_t pack_cell_reverse_v(char *buf, celldata_t *cell) {
  char *cursor = buf;
  pack_field(cursor, cell->v, cell->natom);
  return static_cast<size_t>(cursor - buf);
}
//['v']
size_t unpack_cell_reverse_v(char *buf, celldata_t *cell){
  char *ptr = buf;
  vec<real> *v_buf = (vec<real>*)ptr;
  for (int i = 0; i < cell->natom; i ++){
    vecaddv(cell->v[i], cell->v[i], v_buf[i]);
  }
  ptr += sizeof(*(cell->v)) * cell->natom;
  return ptr - buf;
}

size_t pack_brick_reverse_v(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += pack_cell_reverse_v(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

size_t unpack_brick_reverse_v(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += unpack_cell_reverse_v(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

// Send cell->v back to both neighbours along x, then y, then z and combine
// what arrives with the local values via unpack_brick_reverse_v.
//
// Each axis phase does: post both receives, pack the two outgoing slabs,
// start both sends, wait for all four requests, then unpack. Outgoing data
// comes from [lo, lo + nn) and [hi - nn, hi) on the active axis; incoming
// data is applied to [0, nn) and [nlocal - nn, nlocal). Axes not yet
// processed are covered over [lo, hi), axes already processed over
// [0, nlocal).
void reverse_comm_v(cellgrid_t *grid, mpp_t *mpp) {
  const vec<int> &lower = grid->dim.lo;
  const vec<int> &upper = grid->dim.hi;
  const vec<int> &extent = grid->nlocal;
  const int width = grid->nn;

  size_t bytes_to_prev = 0;
  size_t bytes_to_next = 0;

  // ---- x axis ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  bytes_to_prev = pack_brick_reverse_v(mpp->send_prev, grid, lower.x, lower.x + width, lower.y, upper.y, lower.z, upper.z);
  bytes_to_next = pack_brick_reverse_v(mpp->send_next, grid, upper.x - width, upper.x, lower.y, upper.y, lower.z, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, x);
  COMM_ISEND(mpp, bytes_to_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_v(mpp->recv_prev, grid, 0, width, lower.y, upper.y, lower.z, upper.z);
  unpack_brick_reverse_v(mpp->recv_next, grid, extent.x - width, extent.x, lower.y, upper.y, lower.z, upper.z);

  // ---- y axis ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  bytes_to_prev = pack_brick_reverse_v(mpp->send_prev, grid, 0, extent.x, lower.y, lower.y + width, lower.z, upper.z);
  bytes_to_next = pack_brick_reverse_v(mpp->send_next, grid, 0, extent.x, upper.y - width, upper.y, lower.z, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, y);
  COMM_ISEND(mpp, bytes_to_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_v(mpp->recv_prev, grid, 0, extent.x, 0, width, lower.z, upper.z);
  unpack_brick_reverse_v(mpp->recv_next, grid, 0, extent.x, extent.y - width, extent.y, lower.z, upper.z);

  // ---- z axis ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  bytes_to_prev = pack_brick_reverse_v(mpp->send_prev, grid, 0, extent.x, 0, extent.y, lower.z, lower.z + width);
  bytes_to_next = pack_brick_reverse_v(mpp->send_next, grid, 0, extent.x, 0, extent.y, upper.z - width, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, z);
  COMM_ISEND(mpp, bytes_to_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_v(mpp->recv_prev, grid, 0, extent.x, 0, extent.y, 0, width);
  unpack_brick_reverse_v(mpp->recv_next, grid, 0, extent.x, 0, extent.y, extent.z - width, extent.z);
}

// Serialise a cell's export list into `buf`: an 8-byte header slot holding
// cell->nexport (stored as a long), followed by nexport elements each of
// x, q, tag, t, v and mass. The exported entries are read from the tail of
// each array, i.e. indices [CELL_CAP - nexport, CELL_CAP).
//   returns : number of bytes written, header slot included
__attribute__((noinline)) size_t pack_cell_forward_export_list(char *buf, celldata_t *cell){
  // Cells are packed back to back with variable-length payloads, so `buf` is
  // not guaranteed to be aligned for `long`. Copy the header bytewise instead
  // of storing through a (long*) cast; the bytes written are the same.
  const long nexport_header = cell->nexport;
  memcpy(buf, &nexport_header, sizeof(nexport_header));
  // The header slot is always 8 bytes wide, matching the unpack side.
  char *ptr = buf + 8;
  pack_field(ptr, cell->x + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->q + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->tag + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->t + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->v + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->mass + (CELL_CAP - cell->nexport), cell->nexport);
  return ptr - buf;
}
//['x', 'q', 'tag', 't', 'v', 'mass', 'nbonded_export', 'nchain2_export', 'nscal_export', 'nexcl_export', 'nimpr_export', 'first_bonded', 'first_chain2', 'first_scal_atom', 'first_excl_atom', 'first_impr', 'bonded_tag', 'chain2_tag', 'excl_tag', 'scal_tag', 'impr_idx', 'shake']
// Inverse of pack_cell_forward_export_list: read the export count from the
// 8-byte header slot into cell->nexport, then copy nexport elements each of
// x, q, tag, t, v and mass into the tail of the cell's arrays, i.e. indices
// [CELL_CAP - nexport, CELL_CAP).
//   returns : number of bytes consumed, header slot included
// NOTE(review): the received count is used as-is; nothing here checks that
// it is within [0, CELL_CAP].
size_t unpack_cell_forward_export_list(char *buf, celldata_t *cell){
  // `buf` may sit at any byte offset (preceding cells have variable-length
  // payloads), so load the header bytewise rather than through a (long*) cast.
  long nexport_header;
  memcpy(&nexport_header, buf, sizeof(nexport_header));
  cell->nexport = nexport_header;
  // The header slot is always 8 bytes wide, matching the pack side.
  char *ptr = buf + 8;
  unpack_field(ptr, cell->x + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->q + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->tag + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->t + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->v + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->mass + (CELL_CAP - cell->nexport), cell->nexport);
  return ptr - buf;
}

size_t pack_brick_forward_export_list(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += pack_cell_forward_export_list(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

size_t unpack_brick_forward_export_list(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += unpack_cell_forward_export_list(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

// Exchange per-cell export lists with both neighbours along z, then y,
// then x.
//
// Each axis phase does: post both receives, pack the two outgoing slabs,
// start both sends, wait for all four requests, then unpack. Outgoing slabs
// are grid->nn cells thick, taken from [0, nn) and [nlocal - nn, nlocal) on
// the active axis; incoming data lands in [lo, lo + nn) and [hi - nn, hi).
// Axes already exchanged are covered over their full [lo, hi) range.
//
// When __sw__ is defined the brick routines are redirected to
// pack_brick_export_sw / unpack_brick_export_sw. The #defines are never
// #undef'ed, so they stay in effect for the rest of the translation unit.
void forward_comm_export_list(cellgrid_t *grid, mpp_t *mpp) {
  const vec<int> &lower = grid->dim.lo;
  const vec<int> &upper = grid->dim.hi;
  const vec<int> &extent = grid->nlocal;
  const int width = grid->nn;

  size_t bytes_to_prev = 0;
  size_t bytes_to_next = 0;

  // ---- z axis ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  #ifdef __sw__
  #define pack_brick_forward_export_list pack_brick_export_sw
  #define unpack_brick_forward_export_list unpack_brick_export_sw
  #endif
  bytes_to_prev = pack_brick_forward_export_list(mpp->send_prev, grid, 0, extent.x, 0, extent.y, 0, width);
  bytes_to_next = pack_brick_forward_export_list(mpp->send_next, grid, 0, extent.x, 0, extent.y, extent.z - width, extent.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, z);
  COMM_ISEND(mpp, bytes_to_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list(mpp->recv_prev, grid, 0, extent.x, 0, extent.y, lower.z, lower.z + width);
  unpack_brick_forward_export_list(mpp->recv_next, grid, 0, extent.x, 0, extent.y, upper.z - width, upper.z);

  // ---- y axis ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  bytes_to_prev = pack_brick_forward_export_list(mpp->send_prev, grid, 0, extent.x, 0, width, lower.z, upper.z);
  bytes_to_next = pack_brick_forward_export_list(mpp->send_next, grid, 0, extent.x, extent.y - width, extent.y, lower.z, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, y);
  COMM_ISEND(mpp, bytes_to_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list(mpp->recv_prev, grid, 0, extent.x, lower.y, lower.y + width, lower.z, upper.z);
  unpack_brick_forward_export_list(mpp->recv_next, grid, 0, extent.x, upper.y - width, upper.y, lower.z, upper.z);

  // ---- x axis ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  bytes_to_prev = pack_brick_forward_export_list(mpp->send_prev, grid, 0, width, lower.y, upper.y, lower.z, upper.z);
  bytes_to_next = pack_brick_forward_export_list(mpp->send_next, grid, extent.x - width, extent.x, lower.y, upper.y, lower.z, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, x);
  COMM_ISEND(mpp, bytes_to_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list(mpp->recv_prev, grid, lower.x, lower.x + width, lower.y, upper.y, lower.z, upper.z);
  unpack_brick_forward_export_list(mpp->recv_next, grid, upper.x - width, upper.x, lower.y, upper.y, lower.z, upper.z);
}

// Serialise a cell's export list into `buf`, "cg" variant: an 8-byte header
// slot holding cell->nexport (stored as a long), followed by nexport
// elements each of x, q, tag, t, v, mass and shake_tmp. The exported entries
// are read from the tail of each array, i.e. indices
// [CELL_CAP - nexport, CELL_CAP).
//   returns : number of bytes written, header slot included
size_t pack_cell_forward_export_list_cg(char *buf, celldata_t *cell){
  // Cells are packed back to back with variable-length payloads, so `buf` is
  // not guaranteed to be aligned for `long`. Copy the header bytewise instead
  // of storing through a (long*) cast; the bytes written are the same.
  const long nexport_header = cell->nexport;
  memcpy(buf, &nexport_header, sizeof(nexport_header));
  // The header slot is always 8 bytes wide, matching the unpack side.
  char *ptr = buf + 8;
  pack_field(ptr, cell->x + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->q + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->tag + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->t + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->v + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->mass + (CELL_CAP - cell->nexport), cell->nexport);

  // Extra field carried only by the "cg" variant.
  pack_field(ptr, cell->shake_tmp + (CELL_CAP - cell->nexport), cell->nexport);
  return ptr - buf;
}
//['x', 'q', 'tag', 't', 'v', 'mass', 'nbonded_export', 'nchain2_export', 'nscal_export', 'nexcl_export', 'nimpr_export', 'first_bonded', 'first_chain2', 'first_scal_atom', 'first_excl_atom', 'first_impr', 'bonded_tag', 'chain2_tag', 'excl_tag', 'scal_tag', 'impr_idx', 'shake', 'shake_tmp']
// Inverse of pack_cell_forward_export_list_cg: read the export count from
// the 8-byte header slot into cell->nexport, then copy nexport elements each
// of x, q, tag, t, v, mass and shake_tmp into the tail of the cell's arrays,
// i.e. indices [CELL_CAP - nexport, CELL_CAP).
//   returns : number of bytes consumed, header slot included
// NOTE(review): the received count is used as-is; nothing here checks that
// it is within [0, CELL_CAP].
size_t unpack_cell_forward_export_list_cg(char *buf, celldata_t *cell){
  // `buf` may sit at any byte offset (preceding cells have variable-length
  // payloads), so load the header bytewise rather than through a (long*) cast.
  long nexport_header;
  memcpy(&nexport_header, buf, sizeof(nexport_header));
  cell->nexport = nexport_header;
  // The header slot is always 8 bytes wide, matching the pack side.
  char *ptr = buf + 8;
  unpack_field(ptr, cell->x + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->q + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->tag + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->t + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->v + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->mass + (CELL_CAP - cell->nexport), cell->nexport);

  // Extra field carried only by the "cg" variant.
  unpack_field(ptr, cell->shake_tmp + (CELL_CAP - cell->nexport), cell->nexport);
  return ptr - buf;
}

size_t pack_brick_forward_export_list_cg(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += pack_cell_forward_export_list_cg(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

size_t unpack_brick_forward_export_list_cg(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += unpack_cell_forward_export_list_cg(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

// Exchange per-cell export lists ("cg" variant, which also carries
// shake_tmp) with both neighbours along z, then y, then x.
//
// Each axis phase does: post both receives, pack the two outgoing slabs,
// start both sends, wait for all four requests, then unpack. Outgoing slabs
// are grid->nn cells thick, taken from [0, nn) and [nlocal - nn, nlocal) on
// the active axis; incoming data lands in [lo, lo + nn) and [hi - nn, hi).
// Axes already exchanged are covered over their full [lo, hi) range.
void forward_comm_export_list_cg(cellgrid_t *grid, mpp_t *mpp) {
  const vec<int> &lower = grid->dim.lo;
  const vec<int> &upper = grid->dim.hi;
  const vec<int> &extent = grid->nlocal;
  const int width = grid->nn;

  size_t bytes_to_prev = 0;
  size_t bytes_to_next = 0;

  // ---- z axis ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  bytes_to_prev = pack_brick_forward_export_list_cg(mpp->send_prev, grid, 0, extent.x, 0, extent.y, 0, width);
  bytes_to_next = pack_brick_forward_export_list_cg(mpp->send_next, grid, 0, extent.x, 0, extent.y, extent.z - width, extent.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, z);
  COMM_ISEND(mpp, bytes_to_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list_cg(mpp->recv_prev, grid, 0, extent.x, 0, extent.y, lower.z, lower.z + width);
  unpack_brick_forward_export_list_cg(mpp->recv_next, grid, 0, extent.x, 0, extent.y, upper.z - width, upper.z);

  // ---- y axis ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  bytes_to_prev = pack_brick_forward_export_list_cg(mpp->send_prev, grid, 0, extent.x, 0, width, lower.z, upper.z);
  bytes_to_next = pack_brick_forward_export_list_cg(mpp->send_next, grid, 0, extent.x, extent.y - width, extent.y, lower.z, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, y);
  COMM_ISEND(mpp, bytes_to_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list_cg(mpp->recv_prev, grid, 0, extent.x, lower.y, lower.y + width, lower.z, upper.z);
  unpack_brick_forward_export_list_cg(mpp->recv_next, grid, 0, extent.x, upper.y - width, upper.y, lower.z, upper.z);

  // ---- x axis ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  bytes_to_prev = pack_brick_forward_export_list_cg(mpp->send_prev, grid, 0, width, lower.y, upper.y, lower.z, upper.z);
  bytes_to_next = pack_brick_forward_export_list_cg(mpp->send_next, grid, extent.x - width, extent.x, lower.y, upper.y, lower.z, upper.z);
  COMM_ISEND(mpp, bytes_to_prev, prev, x);
  COMM_ISEND(mpp, bytes_to_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list_cg(mpp->recv_prev, grid, lower.x, lower.x + width, lower.y, upper.y, lower.z, upper.z);
  unpack_brick_forward_export_list_cg(mpp->recv_next, grid, upper.x - width, upper.x, lower.y, upper.y, lower.z, upper.z);
}

// Copy the first cell->natom elements of cell->shake_tmp into `buf`.
// Returns the number of bytes written.
size_t pack_cell_forward_shake(char *buf, celldata_t *cell) {
  char *cursor = buf;
  pack_field(cursor, cell->shake_tmp, cell->natom);
  return static_cast<size_t>(cursor - buf);
}
//['shake_tmp']
// Copy cell->natom elements from `buf` into cell->shake_tmp (overwriting).
// Returns the number of bytes consumed.
size_t unpack_cell_forward_shake(char *buf, celldata_t *cell) {
  char *cursor = buf;
  unpack_field(cursor, cell->shake_tmp, cell->natom);
  return static_cast<size_t>(cursor - buf);
}

size_t pack_brick_forward_shake(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += pack_cell_forward_shake(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

size_t unpack_brick_forward_shake(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  char *ptr = buf;
  for (int i = xlo; i < xhi; i ++) {
    for (int j = ylo; j < yhi; j ++) {
      for (int k = zlo; k < zhi; k ++) {
        celldata_t *cell = get_cell_xyz(grid, i, j, k);
        ptr += unpack_cell_forward_shake(ptr, cell);
      }
    }
  }
  return ptr - buf;
}

// Forward exchange of the per-cell shake_tmp data with the prev/next
// neighbours along each axis, processed in the order z, then y, then x.
//
// Every axis phase follows the same sequence:
//   1. post non-blocking receives from both neighbours,
//   2. pack the two nn-thick boundary slabs into mpp->send_prev / send_next,
//   3. post non-blocking sends of exactly the packed byte counts,
//   4. wait for all four requests,
//   5. unpack mpp->recv_prev / recv_next into the grid.
//
// On __sw__ builds packing and unpacking are delegated to the
// *_pre_sw_forward_shake helpers, driven by entries of archdata->pack_params;
// otherwise the generic brick routines are used with explicit index ranges.
//
// grid: cell grid whose shake_tmp arrays are sent and overwritten.
// mpp:  communication state (buffers, neighbours, requests) used by the
//       COMM_* macros.
void forward_comm_shake(cellgrid_t *grid, mpp_t *mpp) {
  // lo/hi bound the full index range of the grid; nlocal is the extent of the
  // range [0, nlocal) that is packed for sending.
  // NOTE(review): presumably [lo, 0) and [nlocal, hi) are the ghost layers — confirm.
  vec<int> *lo = &(grid->dim.lo);
  vec<int> *hi = &(grid->dim.hi);
  vec<int> *nlocal = &(grid->nlocal);
  // Thickness, in cells, of each exchanged slab.
  int nn = grid->nn;
  #ifdef __sw__
  sw_archdata_t *archdata = (sw_archdata_t*)grid->arch_data;
  #endif
  size_t nsend_prev, nsend_next;

  // ---- z axis: only the [0, nlocal) range in x and y is exchanged ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  #ifdef __sw__
  nsend_prev = pack_brick_pre_sw_forward_shake(mpp->send_prev, grid, archdata->pack_params + PACK_FWD_NEG_Z);
  nsend_next = pack_brick_pre_sw_forward_shake(mpp->send_next, grid, archdata->pack_params + PACK_FWD_POS_Z);
  #else
  nsend_prev = pack_brick_forward_shake(mpp->send_prev, grid, 0, nlocal->x, 0, nlocal->y, 0, nn);
  nsend_next = pack_brick_forward_shake(mpp->send_next, grid, 0, nlocal->x, 0, nlocal->y, nlocal->z - nn, nlocal->z);
  #endif
  COMM_ISEND(mpp, nsend_prev, prev, z);
  COMM_ISEND(mpp, nsend_next, next, z);
  COMM_WAITALL(mpp);
  #ifdef __sw__
  unpack_brick_pre_sw_forward_shake(mpp->recv_prev, grid, archdata->pack_params + UNPACK_FWD_NEG_Z);
  unpack_brick_pre_sw_forward_shake(mpp->recv_next, grid, archdata->pack_params + UNPACK_FWD_POS_Z);
  #else
  // Received slabs land in the outermost nn layers at each end of the z range.
  unpack_brick_forward_shake(mpp->recv_prev, grid, 0, nlocal->x, 0, nlocal->y, lo->z, lo->z + nn);
  unpack_brick_forward_shake(mpp->recv_next, grid, 0, nlocal->x, 0, nlocal->y, hi->z - nn, hi->z);
  #endif
  // ---- y axis: slabs span the full z range [lo->z, hi->z), which includes
  // the layers just filled by the z phase ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  #ifdef __sw__
  nsend_prev = pack_brick_pre_sw_forward_shake(mpp->send_prev, grid, archdata->pack_params + PACK_FWD_NEG_Y);
  nsend_next = pack_brick_pre_sw_forward_shake(mpp->send_next, grid, archdata->pack_params + PACK_FWD_POS_Y);
  #else
  nsend_prev = pack_brick_forward_shake(mpp->send_prev, grid, 0, nlocal->x, 0, nn, lo->z, hi->z);
  nsend_next = pack_brick_forward_shake(mpp->send_next, grid, 0, nlocal->x, nlocal->y - nn, nlocal->y, lo->z, hi->z);
  #endif
  COMM_ISEND(mpp, nsend_prev, prev, y);
  COMM_ISEND(mpp, nsend_next, next, y);
  COMM_WAITALL(mpp);
  #ifdef __sw__
  unpack_brick_pre_sw_forward_shake(mpp->recv_prev, grid, archdata->pack_params + UNPACK_FWD_NEG_Y);
  unpack_brick_pre_sw_forward_shake(mpp->recv_next, grid, archdata->pack_params + UNPACK_FWD_POS_Y);
  #else
  unpack_brick_forward_shake(mpp->recv_prev, grid, 0, nlocal->x, lo->y, lo->y + nn, lo->z, hi->z);
  unpack_brick_forward_shake(mpp->recv_next, grid, 0, nlocal->x, hi->y - nn, hi->y, lo->z, hi->z);
  #endif
  // ---- x axis: slabs span the full y and z ranges, including the layers
  // filled by the two previous phases ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  #ifdef __sw__
  nsend_prev = pack_brick_pre_sw_forward_shake(mpp->send_prev, grid, archdata->pack_params + PACK_FWD_NEG_X);
  nsend_next = pack_brick_pre_sw_forward_shake(mpp->send_next, grid, archdata->pack_params + PACK_FWD_POS_X);
  #else
  nsend_prev = pack_brick_forward_shake(mpp->send_prev, grid, 0, nn, lo->y, hi->y, lo->z, hi->z);
  nsend_next = pack_brick_forward_shake(mpp->send_next, grid, nlocal->x - nn, nlocal->x, lo->y, hi->y, lo->z, hi->z);
  #endif
  COMM_ISEND(mpp, nsend_prev, prev, x);
  COMM_ISEND(mpp, nsend_next, next, x);
  COMM_WAITALL(mpp);
  #ifdef __sw__
  // puts("UNPACKX");
  unpack_brick_pre_sw_forward_shake(mpp->recv_prev, grid, archdata->pack_params + UNPACK_FWD_NEG_X);
  unpack_brick_pre_sw_forward_shake(mpp->recv_next, grid, archdata->pack_params + UNPACK_FWD_POS_X);
  #else
  unpack_brick_forward_shake(mpp->recv_prev, grid, lo->x, lo->x + nn, lo->y, hi->y, lo->z, hi->z);
  unpack_brick_forward_shake(mpp->recv_next, grid, hi->x - nn, hi->x, lo->y, hi->y, lo->z, hi->z);
  #endif
}

// Copies the cell's shake_tmp array (cell->natom elements) to the start of
// buf for the reverse exchange and returns the number of bytes written.
size_t pack_cell_reverse_shake(char *buf, celldata_t *cell) {
  const size_t nbytes = sizeof(*(cell->shake_tmp)) * cell->natom;
  memcpy(buf, cell->shake_tmp, nbytes);
  return nbytes;
}
//['shake_tmp']
size_t unpack_cell_reverse_shake(char *buf, celldata_t *cell){
  char *ptr = buf;
  vec<real> *shake_tmp_buf = (vec<real>*)ptr;
  for (int i = 0; i < cell->natom; i ++){
    vecaddv(cell->shake_tmp[i], cell->shake_tmp[i], shake_tmp_buf[i]);
  }
  ptr += sizeof(*(cell->shake_tmp)) * cell->natom;
  return ptr - buf;
}

// Packs the reverse-shake payload of every cell in the index box
// [xlo, xhi) x [ylo, yhi) x [zlo, zhi) into buf, back to back.
// Cells are visited with x as the outermost and z as the innermost index.
// Returns the total number of bytes written.
size_t pack_brick_reverse_shake(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  size_t written = 0;
  for (int cx = xlo; cx < xhi; ++cx) {
    for (int cy = ylo; cy < yhi; ++cy) {
      for (int cz = zlo; cz < zhi; ++cz) {
        celldata_t *cell = get_cell_xyz(grid, cx, cy, cz);
        written += pack_cell_reverse_shake(buf + written, cell);
      }
    }
  }
  return written;
}

// Applies the reverse-shake payload in buf to every cell in the index box
// [xlo, xhi) x [ylo, yhi) x [zlo, zhi), in the same traversal order used when
// packing (x outermost, z innermost).
// Returns the total number of bytes consumed.
size_t unpack_brick_reverse_shake(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  size_t consumed = 0;
  for (int cx = xlo; cx < xhi; ++cx) {
    for (int cy = ylo; cy < yhi; ++cy) {
      for (int cz = zlo; cz < zhi; ++cz) {
        celldata_t *cell = get_cell_xyz(grid, cx, cy, cz);
        consumed += unpack_cell_reverse_shake(buf + consumed, cell);
      }
    }
  }
  return consumed;
}

// Reverse exchange of the per-cell shake_tmp data with the prev/next
// neighbours along each axis, processed in the order x, then y, then z.
//
// For each axis the outermost nn layers at both ends of the grid's index
// range are packed and sent, and the data received from each neighbour is
// applied (through unpack_brick_reverse_shake) to the nn layers at the
// corresponding end of the [0, nlocal) range. Later axes use progressively
// narrower ranges in the axes already handled.
//
// grid: cell grid whose shake_tmp arrays are sent and updated.
// mpp:  communication state (buffers, neighbours, requests) used by the
//       COMM_* macros.
void reverse_comm_shake(cellgrid_t *grid, mpp_t *mpp) {
  const vec<int> &ext_lo = grid->dim.lo;
  const vec<int> &ext_hi = grid->dim.hi;
  const vec<int> &own = grid->nlocal;
  const int width = grid->nn;

  // ---- x axis: full y and z ranges ----
  {
    COMM_IRECV(mpp, prev, x);
    COMM_IRECV(mpp, next, x);
    const size_t bytes_prev =
        pack_brick_reverse_shake(mpp->send_prev, grid, ext_lo.x, ext_lo.x + width, ext_lo.y, ext_hi.y, ext_lo.z, ext_hi.z);
    const size_t bytes_next =
        pack_brick_reverse_shake(mpp->send_next, grid, ext_hi.x - width, ext_hi.x, ext_lo.y, ext_hi.y, ext_lo.z, ext_hi.z);
    COMM_ISEND(mpp, bytes_prev, prev, x);
    COMM_ISEND(mpp, bytes_next, next, x);
    COMM_WAITALL(mpp);
    unpack_brick_reverse_shake(mpp->recv_prev, grid, 0, width, ext_lo.y, ext_hi.y, ext_lo.z, ext_hi.z);
    unpack_brick_reverse_shake(mpp->recv_next, grid, own.x - width, own.x, ext_lo.y, ext_hi.y, ext_lo.z, ext_hi.z);
  }

  // ---- y axis: x restricted to [0, nlocal.x), full z range ----
  {
    COMM_IRECV(mpp, prev, y);
    COMM_IRECV(mpp, next, y);
    const size_t bytes_prev =
        pack_brick_reverse_shake(mpp->send_prev, grid, 0, own.x, ext_lo.y, ext_lo.y + width, ext_lo.z, ext_hi.z);
    const size_t bytes_next =
        pack_brick_reverse_shake(mpp->send_next, grid, 0, own.x, ext_hi.y - width, ext_hi.y, ext_lo.z, ext_hi.z);
    COMM_ISEND(mpp, bytes_prev, prev, y);
    COMM_ISEND(mpp, bytes_next, next, y);
    COMM_WAITALL(mpp);
    unpack_brick_reverse_shake(mpp->recv_prev, grid, 0, own.x, 0, width, ext_lo.z, ext_hi.z);
    unpack_brick_reverse_shake(mpp->recv_next, grid, 0, own.x, own.y - width, own.y, ext_lo.z, ext_hi.z);
  }

  // ---- z axis: x and y restricted to [0, nlocal) ----
  {
    COMM_IRECV(mpp, prev, z);
    COMM_IRECV(mpp, next, z);
    const size_t bytes_prev =
        pack_brick_reverse_shake(mpp->send_prev, grid, 0, own.x, 0, own.y, ext_lo.z, ext_lo.z + width);
    const size_t bytes_next =
        pack_brick_reverse_shake(mpp->send_next, grid, 0, own.x, 0, own.y, ext_hi.z - width, ext_hi.z);
    COMM_ISEND(mpp, bytes_prev, prev, z);
    COMM_ISEND(mpp, bytes_next, next, z);
    COMM_WAITALL(mpp);
    unpack_brick_reverse_shake(mpp->recv_prev, grid, 0, own.x, 0, own.y, 0, width);
    unpack_brick_reverse_shake(mpp->recv_next, grid, 0, own.x, 0, own.y, own.z - width, own.z);
  }
}

